Time series, the course I often wish I had taken while completing my coursework in school. I finally got an excuse to do a comparitive dive into the different time series models in the forcast package in R thanks to an invitation to present at a recent Practical Data Science Meetup in Salt Lake City.
library(tidyverse)
library(gridExtra)
library(lubridate)
library(leaflet)
library(randomForest)
library(forecast)
library(prophet)
# Visualize sensor locations
w.sensors <- weather %>% distinct(LATITUDE,LONGITUDE)
pm.sensors <- pm25 %>%
# Remove data from sensors that don't span entire time period
filter(!(AQS_SITE_ID %in% c(490353013,490450003))) %>%
distinct(AQS_SITE_ID,SITE_LATITUDE,SITE_LONGITUDE) %>%
rename(LATITUDE=SITE_LATITUDE,
LONGITUDE=SITE_LONGITUDE)
w.icons <- awesomeIcons(
icon = 'tint',
iconColor = 'blue',
markerColor = "white"
)
pm.icons <- awesomeIcons(
icon = 'cloud',
iconColor = 'gray',
markerColor = "black"
)
m <- leaflet() %>%
addProviderTiles(providers$Esri.NatGeoWorldMap) %>%
setView(-112, 40.7, zoom = 10) %>%
addAwesomeMarkers(data=w.sensors, icon = w.icons) %>%
addAwesomeMarkers(data=pm.sensors, icon = pm.icons,label = ~as.character(AQS_SITE_ID))
m
fit1 <- lm(sqrt(pm2.5)~inversion+wind+precip+fireworks,data=dat)
summary(fit1)
##
## Call:
## lm(formula = sqrt(pm2.5) ~ inversion + wind + precip + fireworks,
## data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.1571 -0.5555 -0.1835 0.3608 4.4629
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.322431 0.066358 50.068 < 2e-16 ***
## inversion 2.527237 0.130122 19.422 < 2e-16 ***
## wind -0.040543 0.003255 -12.454 < 2e-16 ***
## precip -0.515741 0.175563 -2.938 0.00336 **
## fireworks 0.545624 0.116089 4.700 2.85e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8791 on 1456 degrees of freedom
## Multiple R-squared: 0.3165, Adjusted R-squared: 0.3146
## F-statistic: 168.5 on 4 and 1456 DF, p-value: < 2.2e-16
dat$resid[!is.na(dat$pm2.5)] <- resid(fit1)
# Plot the residuals
ggplot(dat,aes(date,resid)) +
geom_point() + geom_smooth() +
ggtitle("Linear Regression Residuals",
subtitle = paste0("RMSE: ",round(sqrt(mean(dat$resid^2,na.rm=TRUE)),2)))
Acf(dat$resid, main="ACF of OLS Residuals")
fit2 <- randomForest(sqrt(pm2.5)~inversion+wind+precip+fireworks,data=dat[!is.na(dat$pm2.5),], ntree=500)
dat$rf.resid[!is.na(dat$pm2.5)] <- fit2$predicted - sqrt(dat$pm2.5[!is.na(dat$pm2.5)])
# Plot the residuals
ggplot(dat,aes(date,rf.resid)) +
geom_point() + geom_smooth() +
ggtitle("Random Forest Residuals",
subtitle = paste0("RMSE: ",round(sqrt(fit2$mse[500]),2)))
# Better but we still have some odd things going on in our data
# Zoom In
p1 <- ggplot(dat,aes(date,rf.resid)) +
geom_point() + geom_line() +
xlim(as.Date(c("2014-01-01","2014-02-28"))) +
geom_abline(slope=0, intercept = 0, lty=2, col = "blue", lwd = 1.25)
p2 <- ggplot(dat,aes(date,rf.resid)) +
geom_point() + geom_line() +
xlim(as.Date(c("2017-11-01","2017-12-31"))) +
geom_abline(slope=0, intercept = 0, lty=2, col = "blue", lwd = 1.25)
grid.arrange(p1, p2, ncol=2, top="Zoom-in of Random Forest Residuals")
# Convert to time series data
dat.ts <- sqrt(ts(dat[,"pm2.5"], frequency = 7))
# Exponential smoothing model with weekly seasonality
fit3 <- ets(dat.ts) # model = "MAN"
fit4a <- ets(dat.ts, model ="AAA")
fit4b <- ets(dat.ts, model ="MMM")
# Fit models with all additive or all multiplicative features. First byte is for errors, second for trend, and third for seasonality
# Predict Future Values
plot(predict(fit3,h=25),xlim=c(200,215))
ets.mod <- rbind(data.frame(day=1:sum(!is.na(dat.ts)), resid=as.numeric(residuals(fit3)), type="Auto"),
data.frame(day=1:sum(!is.na(dat.ts)), resid=as.numeric(residuals(fit4a)), type="Additive"),
data.frame(day=1:sum(!is.na(dat.ts)), resid=as.numeric(residuals(fit4b)), type="Multiplicative"))
# Compare the residuals of each model
ggplot(ets.mod,aes(day,resid)) +
geom_point() + geom_smooth() +
facet_grid(type~.,scales="free")+
ggtitle("ETS Residuals with Weekly Seasonality",
subtitle = paste0("Auto RMSE: ",round(sqrt(fit3$mse),2),
" Additive RMSE: ",round(sqrt(fit4a$mse),2),
" Multiplicative RMSE: ",round(sqrt(fit4b$mse),2)))
# TBATS model with weekly and yearly seasonality
dat.ts2 <- sqrt(msts(dat[!is.na(dat$pm2.5),"pm2.5"], seasonal.periods=c(7,365.25)))
fit5 <- tbats(dat.ts2)
# This method takes the most time when comparing run time.
# Down side on this is that you cannot set specific box-cox, ARMA, and fourier parameters.
# Predict future values
plot(predict(fit5, h=25),xlim=c(4.8,5.2))
# Plot the residuals
tbats.mod <- data.frame(day=1:sum(!is.na(dat.ts2)),resid=as.numeric(residuals(fit5)))
ggplot(tbats.mod,aes(day,resid)) +
geom_point() + geom_smooth() +
ggtitle("TBATS Resids with Dual Seasonality",
subtitle = paste0("Auto RMSE: ",round(sqrt(mean((residuals(fit5))^2)),2)))
# ARIMA with weekly and yearly seasonality with regressors
regs <- dat[!is.na(dat$pm2.5),c("precip","wind","inversion_diff","fireworks")]
# Forecast weather regressors
weather.ts <- msts(dat[,c("precip","wind","inversion_diff")],seasonal.periods = c(7,365.25))
precip <- auto.arima(weather.ts[,1])
fprecip <- as.numeric(data.frame(forecast(precip,h=25))$Point.Forecast)
wind <- auto.arima(weather.ts[,2])
fwind <- as.numeric(data.frame(forecast(wind,h=25))$Point.Forecast)
inversion <- auto.arima(weather.ts[,3])
finversion <- as.numeric(data.frame(forecast(inversion,h=25))$Point.Forecast)
fregs <- data.frame(precip=fprecip,wind=fwind,inversion=as.numeric(finversion<0),fireworks=0)
# Seasonality
z <- fourier(dat.ts2, K=c(2,5))
zf <- fourier(dat.ts2, K=c(2,5), h=25)
# Fit the model
fit <- auto.arima(dat.ts2, xreg=cbind(z,regs), seasonal=FALSE)
# Predict Future Values
# This time we need future values of the regressors as well.
fc <- forecast(fit, xreg=cbind(zf,fregs), h=25)
plot(fc,xlim=c(4.8,5.2))
# Plot the residuals
arima.mod <- data.frame(day=1:sum(!is.na(dat.ts)),resid=as.numeric(residuals(fit)))
ggplot(arima.mod,aes(day,resid)) +
geom_point() + geom_smooth() +
ggtitle("Arima Resids with Seasonality and Regressors",
subtitle = paste0("RMSE: ",round(sqrt(mean((residuals(fit))^2)),2)))
pdat <- data.frame(ds=dat$date,
y=sqrt(dat$pm2.5),
precip=dat$precip,
wind=dat$wind,
inversion_diff=dat$inversion_diff,
inversion=dat$inversion_,
fireworks=dat$fireworks)
# Forecast weather regressors
pfdat <- data.frame(ds=max(dat$date) + 1:25)
pprecip <- pdat %>%
select(ds,y=precip) %>%
prophet() %>%
predict(pfdat)
## Initial log joint probability = -5.77805
## Optimization terminated normally:
## Convergence detected: relative gradient magnitude is below tolerance
pwind <- pdat %>%
select(ds,y=wind) %>%
prophet() %>%
predict(pfdat)
## Initial log joint probability = -46.5575
## Optimization terminated normally:
## Convergence detected: relative gradient magnitude is below tolerance
pinversion <- pdat %>%
select(ds,y=inversion_diff) %>%
prophet() %>%
predict(pfdat)
## Initial log joint probability = -55.0515
## Optimization terminated normally:
## Convergence detected: relative gradient magnitude is below tolerance
fdat <- data.frame(ds=pfdat$ds,
precip=pprecip$yhat,
wind=pwind$yhat,
inversion=as.numeric(pinversion$yhat<0),
fireworks = 0)
# Fit the model (Seasonality automatically determined)
fit6 <- prophet() %>%
add_regressor('precip') %>%
add_regressor('wind') %>%
add_regressor('inversion') %>%
add_regressor('fireworks') %>%
fit.prophet(pdat)
## Initial log joint probability = -120.752
## Optimization terminated normally:
## Convergence detected: relative gradient magnitude is below tolerance
# Forecast future values
forecast <- predict(fit6, fdat)
# Get the residuals
fpred <- predict(fit6)
fpred$ds <- as.Date(fpred$ds)
fpred <- pdat %>% left_join(fpred,by="ds")
fpred$resid <- fpred$y - fpred$yhat
# Plot the residuals
ggplot(fpred,aes(ds,resid)) +
geom_point() + geom_smooth() +
ggtitle("Prophet with Seasonality and Regressors",
subtitle = paste0("RMSE: ",round(sqrt(mean(fpred$resid^2)),2)))
# RMSE by cutoff
all.cv %>%
group_by(model,cutoff) %>%
summarise(rmse=sqrt(mean((y-yhat)^2))) %>%
ggplot(.,aes(x=cutoff,y=rmse,group=model,color=model)) +
geom_line(alpha=.75) + geom_point(alpha=.75)
# RMSE by horizon
all.cv %>%
group_by(model,day) %>%
summarise(rmse=sqrt(mean((y-yhat)^2))) %>%
ggplot(.,aes(x=day,y=rmse,group=model,color=model)) +
geom_line(alpha=.75) + geom_point(alpha=.75)
# Prediction behaviors of different methods
ggplot(all.cv,aes(date,yhat,group=as.factor(cutoff),color=as.factor(cutoff)))+
geom_line()+
geom_line(aes(y=y),color="black",alpha=.15)+#geom_point(aes(y=y),color="black",alpha=.15)+
facet_wrap(~model)+ guides(color="none") +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())